library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(forecast)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(TSA)
## Registered S3 methods overwritten by 'TSA':
## method from
## fitted.Arima forecast
## plot.Arima forecast
##
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
library(MTS)
library(lmtest)
library(ggplot2)
library(Metrics)
##
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
##
## accuracy
# Load the CTA daily ridership data and parse service dates (mm/dd/yyyy).
df <- read.csv("CTA.csv")
df$service_date <- as.Date(df$service_date, format = "%m/%d/%Y")

# Sort chronologically, then keep only the first row for any duplicated date.
df <- df[order(df$service_date), ]
df <- df[!duplicated(df$service_date), ]

# Split at the US COVID-19 national-emergency declaration (2020-03-13).
pre.covid <- df[df$service_date < as.Date("2020-03-13"), ]
post.covid <- df[df$service_date >= as.Date("2020-03-13"), ]

# df is already sorted, so these sorts are defensive no-ops; reset the row
# names so each subset indexes from 1.
pre.covid <- pre.covid[order(pre.covid$service_date), ]
post.covid <- post.covid[order(post.covid$service_date), ]
rownames(pre.covid) <- NULL
rownames(post.covid) <- NULL
On March 19th, 2020, Chicago declared a state of emergency for the COVID-19 pandemic, which lines up with the significant drop in ridership here. The United States declared a national emergency for the COVID-19 pandemic on March 13th, 2020, which explains the initial drop a few days earlier. The data has been split into pre-COVID and post-COVID periods for analysis.
Total rides is the sum of bus and rail_boardings. We can do a quick check to ensure this is the case, which it is.
# Sanity check: total_rides should equal bus + rail_boardings for every day.
all(df$rail_boardings + df$bus == df$total_rides)
## [1] TRUE
Based on the ACF plot below, there is very strong weekly seasonality in the data: each day of the week tends to behave like the same day in previous weeks. The autocorrelation is extremely persistent and does not die down, suggesting non-stationarity. The variance is roughly constant over time, but the level trends upward in the years just before the COVID-19 pandemic.
# Pre-COVID total ridership over time. Added title and axis labels for
# consistency with the titled STL plots elsewhere in this script.
plot(pre.covid$service_date, pre.covid$total_rides, type='l',
     main='Pre-COVID Total Ridership', xlab='Service Date', ylab='Total Rides')
# ACF out to lag 200: strong weekly peaks with very slow decay,
# indicating non-stationarity.
acf(pre.covid$total_rides, 200, main='ACF of Pre-COVID Total Ridership')
The data is not stationary, so we need to difference it. The results of the below tests confirm that it is indeed stationary now.
# KPSS test (null: level stationarity) on the first difference of total rides.
# p-value = 0.1 (capped by the implementation) -> fail to reject stationarity.
kpss.test(diff(pre.covid$total_rides), null='L')
##
## KPSS Test for Level Stationarity
##
## data: diff(pre.covid$total_rides)
## KPSS Level = 0.0020757, Truncation lag parameter = 11, p-value = 0.1
# ADF test (null: unit root). p-value = 0.01 (floored) -> reject the unit
# root; both tests agree the differenced series is stationary.
adf.test(diff(pre.covid$total_rides))
##
## Augmented Dickey-Fuller Test
##
## data: diff(pre.covid$total_rides)
## Dickey-Fuller = -33.711, Lag order = 19, p-value = 0.01
## alternative hypothesis: stationary
From the STL plot below, in addition to short term seasonality of weekly, there is also annual seasonality given by the similarity in the trend lines at the 1 year marks. The time is in weeks.
# First 1000 days (roughly 2001-2003) at weekly frequency for STL decomposition.
tot.ts <- ts(pre.covid$total_rides[1:1000], frequency=7)
# Fix: removed the stray space in the plot title ("( 2001" -> "(2001").
plot(stl(tot.ts, s.window='periodic'), main='Pre-COVID Total Rides STL Decomposition (2001 - 2003)')
Based on the ACF plot below, rail boardings also show very strong weekly seasonality: each day of the week tends to behave like the same day in previous weeks. The autocorrelation is extremely persistent and does not die down, suggesting non-stationarity. The variance is roughly constant over time, but the level trends upward in the years just before the COVID-19 pandemic.
# Pre-COVID rail boardings over time. Added title and axis labels for
# consistency with the titled STL plots elsewhere in this script.
plot(pre.covid$service_date, pre.covid$rail_boardings, type='l',
     main='Pre-COVID Rail Boardings', xlab='Service Date', ylab='Rail Boardings')
# ACF out to lag 200: weekly peaks with slow decay (non-stationarity).
acf(pre.covid$rail_boardings, 200, main='ACF of Pre-COVID Rail Boardings')
The data is not stationary, so we need to difference it. The results of the below tests confirm that it is indeed stationary now.
# KPSS test (null: level stationarity) on differenced rail boardings.
# p-value = 0.1 (capped) -> consistent with stationarity after differencing.
kpss.test(diff(pre.covid$rail_boardings), null='L')
##
## KPSS Test for Level Stationarity
##
## data: diff(pre.covid$rail_boardings)
## KPSS Level = 0.0019168, Truncation lag parameter = 11, p-value = 0.1
# ADF test (null: unit root). p-value = 0.01 (floored) -> reject the unit root.
adf.test(diff(pre.covid$rail_boardings))
##
## Augmented Dickey-Fuller Test
##
## data: diff(pre.covid$rail_boardings)
## Dickey-Fuller = -34.138, Lag order = 19, p-value = 0.01
## alternative hypothesis: stationary
From the STL plot below, in addition to short term seasonality of weekly, there is also annual seasonality given by the similarity in the trend lines at the 1 year marks. The time is in weeks.
# First 1000 days (roughly 2001-2003) of rail boardings at weekly frequency.
rail.ts <- ts(pre.covid$rail_boardings[1:1000], frequency=7)
# Fix: the title previously said "Total Rides" but this decomposes rail boardings.
plot(stl(rail.ts, s.window='periodic'), main='Pre-COVID Rail Boardings STL Decomposition (2001 - 2003)')
# Fix: this is a PACF, so the title now says "Partial Autocorrelation"
# (it previously said "Autocorrelation").
pacf(pre.covid$total_rides, 200, main="Partial Autocorrelation of Pre-COVID Total Ridership")
The data is not stationary, so we need to difference it. The results of the below tests confirm that it is indeed stationary now.
# KPSS test (null: level stationarity) on differenced bus boardings.
# p-value = 0.1 (capped) -> consistent with stationarity after differencing.
kpss.test(diff(pre.covid$bus), null='L')
##
## KPSS Test for Level Stationarity
##
## data: diff(pre.covid$bus)
## KPSS Level = 0.0022188, Truncation lag parameter = 11, p-value = 0.1
# ADF test (null: unit root). p-value = 0.01 (floored) -> reject the unit root.
adf.test(diff(pre.covid$bus))
##
## Augmented Dickey-Fuller Test
##
## data: diff(pre.covid$bus)
## Dickey-Fuller = -33.495, Lag order = 19, p-value = 0.01
## alternative hypothesis: stationary
# Full pre-COVID bus series at weekly frequency; STL separates the weekly
# seasonal component from the long-run trend.
bus.ts <- ts(pre.covid$bus, frequency=7)
plot(stl(bus.ts, s.window='periodic'), main='Pre-COVID Bus STL Decomposition')
Rail boardings also exhibit a very long memory process, suggesting seasonality at the weekly level as noted by the ACF peaks.
# Full-history bus series (observation index on the x-axis); shows the
# COVID-era collapse in ridership. Titles added for consistency.
plot(df$bus, type='l', main='Bus Boardings (Full History)', ylab='Bus Boardings')
# Post-COVID rail boardings and their ACF out to lag 400.
plot(post.covid$service_date, post.covid$rail_boardings, type='l',
     main='Post-COVID Rail Boardings', xlab='Service Date', ylab='Rail Boardings')
acf(post.covid$rail_boardings, 400, main='ACF of Post-COVID Rail Boardings')
The data is not stationary, so we need to difference it. The results of the tests below confirm that it is indeed stationary after differencing.
# KPSS test (null: level stationarity) on differenced post-COVID rail.
# p-value = 0.1 (capped) -> consistent with stationarity after differencing.
kpss.test(diff(post.covid$rail_boardings), null='L')
##
## KPSS Test for Level Stationarity
##
## data: diff(post.covid$rail_boardings)
## KPSS Level = 0.062211, Truncation lag parameter = 7, p-value = 0.1
# ADF test (null: unit root). p-value = 0.01 (floored) -> reject the unit root.
adf.test(diff(post.covid$rail_boardings))
##
## Augmented Dickey-Fuller Test
##
## data: diff(post.covid$rail_boardings)
## Dickey-Fuller = -17.66, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
From the STL plot below, in addition to short term seasonality of weekly, there is also annual seasonality given by the similarity in the trend lines at the 1 year marks. The time is in weeks.
# NOTE(review): the variable is named rail.ts but this decomposes
# post.covid$total_rides (and the title says "Total Rides"), while the
# surrounding section analyzes rail boardings. Confirm whether
# post.covid$rail_boardings was intended here.
rail.ts <- ts(post.covid$total_rides, frequency=7)
plot(stl(rail.ts, s.window='periodic'),main='Post-COVID Total Rides STL Decomposition')
In bus boardings, there is a heteroscedastic upward trend following the initial drop in ridership in March 2020. The ACF plot shows weekly seasonality similar to the pre-COVID era.
# Post-COVID bus boardings over time and their ACF. Titles added for
# consistency with the titled plots elsewhere in this script.
plot(post.covid$service_date, post.covid$bus, type='l',
     main='Post-COVID Bus Boardings', xlab='Service Date', ylab='Bus Boardings')
acf(post.covid$bus, 200, main='ACF of Post-COVID Bus Boardings')
The data is not stationary, so we need to difference it. The results of the below tests confirm that it is indeed stationary now.
# KPSS test (null: level stationarity) on differenced post-COVID bus.
# p-value = 0.1 (capped) -> consistent with stationarity after differencing.
kpss.test(diff(post.covid$bus), null='L')
##
## KPSS Test for Level Stationarity
##
## data: diff(post.covid$bus)
## KPSS Level = 0.04096, Truncation lag parameter = 7, p-value = 0.1
# ADF test (null: unit root). p-value = 0.01 (floored) -> reject the unit root.
adf.test(diff(post.covid$bus))
##
## Augmented Dickey-Fuller Test
##
## data: diff(post.covid$bus)
## Dickey-Fuller = -16.655, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
There is a similar trend in the Post-COVID era as the pre-COVID era in terms of yearly seasonality, albeit much lower ridership.
# Post-COVID bus series at weekly frequency; STL separates weekly seasonality
# from the (much lower) post-pandemic trend.
bus.ts <- ts(post.covid$bus, frequency=7)
plot(stl(bus.ts, s.window='periodic'), main='Post-COVID Bus STL Decomposition')
We will model both Bus and Rail using intervention analysis given the COVID-19 effect on March 13, 2020.
# COVID-19 intervention date (US national-emergency declaration).
intervention_date <- as.Date("2020-03-13")
# Hold out everything from 2024 onward as the test set.
train_set <- df[df$service_date < as.Date("2024-01-01"), ]
test_set <- df[df$service_date >= as.Date("2024-01-01"), ]
# Number of Fourier harmonics per seasonal period (7 / 30 / 365.25 days).
K_weekly <- 3
K_monthly <- 6
K_yearly <- 12
# Multi-seasonal series over the FULL history so the Fourier-term rows line
# up one-to-one with df; train/test/pre-COVID slices are taken below.
ts_bus <- msts(df$bus, seasonal.periods = c(7, 30, 365.25))
# Create Fourier terms for weekly and yearly seasonality for the entire dataset
fourier_terms_bus <- fourier(ts_bus, K = c(K_weekly, K_monthly, K_yearly))
fourier_terms_bus_train <- fourier_terms_bus[1:nrow(train_set), ]
fourier_terms_bus_test <- fourier_terms_bus[(nrow(train_set) + 1):nrow(df), ]
fourier_terms_bus_precovid <- fourier_terms_bus[1:nrow(pre.covid), ]
# Baseline pre-COVID model on the log scale; seasonality is carried by the
# Fourier regressors, so seasonal ARIMA terms are disabled.
baseline_arima_bus <- auto.arima(log(pre.covid$bus), xreg = fourier_terms_bus_precovid, seasonal=FALSE)
# $arma is c(p, q, P, Q, period, d, D), so this extracts (p, d, q).
arima_orders_bus <- c(baseline_arima_bus$arma[1], baseline_arima_bus$arma[6], baseline_arima_bus$arma[2])
# Pulse indicator at observation 6708.
# NOTE(review): 6708 is hard-coded as the COVID intervention index — confirm
# it equals which(train_set$service_date == intervention_date).
pulse_train <- 1*(seq(train_set$bus) == 6708)
# ARIMAX with an AR(1) transfer function on the pulse, i.e. a decaying
# intervention effect on log bus boardings.
first.model.x <- arimax(log(train_set$bus), order=arima_orders_bus, xtransf=data.frame(pulse_train), xreg=fourier_terms_bus_train, transfer=list(c(1,0)))
## Warning in arimax(log(train_set$bus), order = arima_orders_bus, xtransf =
## data.frame(pulse_train), : possible convergence problem: optim gave code=1
summary(first.model.x)
##
## Call:
## arimax(x = log(train_set$bus), order = arima_orders_bus, xreg = fourier_terms_bus_train,
## xtransf = data.frame(pulse_train), transfer = list(c(1, 0)))
##
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): NaNs produced
## ar1 ar2 ar3 ar4 ma1 ma2 ma3 ma4
## 0.3382 0.1959 -0.9498 0.1824 -1.0126 -0.1467 1.0579 -0.7891
## s.e. 0.0305 0.0157 0.0216 0.0170 0.0279 0.0244 0.0275 0.0252
## S1.7 C1.7 S2.7 C2.7 S3.7 C3.7 S1.30 C1.30
## 0.0927 -0.3137 0.0973 -0.1933 0.0483 -0.0916 -0.0018 -0.0021
## s.e. 0.0035 0.0035 0.0019 0.0019 0.0017 0.0017 0.0027 0.0027
## S2.30 C2.30 S3.30 C3.30 S4.30 C4.30 S5.30 C5.30 S6.30
## 0.0027 0.0004 0.0013 0.0026 0.0015 -0.0002 0.0004 0.0006 -0.0007
## s.e. 0.0026 0.0026 0.0025 0.0025 0.0029 0.0029 0.0023 0.0023 0.0021
## C6.30 S1.365 C1.365 S2.365 C2.365 S3.365 C3.365 S4.365
## -0.0003 -0.0217 -0.0314 -0.0009 -0.0646 0.0234 -0.0358 0.0018
## s.e. 0.0021 0.0113 0.0113 0.0061 0.0061 0.0045 0.0045 0.0038
## C4.365 S5.365 C5.365 S6.365 C6.365 S7.365 C7.365 S8.365
## -0.0225 -0.0096 -0.0145 0.0035 -0.0151 0.007 -0.0147 -0.0098
## s.e. 0.0038 0.0034 0.0034 0.0032 0.0032 0.003 0.0030 0.0029
## C8.365 S9.365 C9.365 S10.365 C10.365 S11.365 C11.365 S12.365
## -0.0335 0.0072 -0.0228 0.0132 -0.0336 0.0157 -0.0117 0.0298
## s.e. 0.0029 0.0028 0.0028 0.0028 0.0028 0.0027 0.0027 0.0027
## C12.365 pulse_train-AR1 pulse_train-MA0
## -0.0171 0.3381 -0.0288
## s.e. 0.0027 NaN 0.1176
##
## sigma^2 estimated as 0.01907: log likelihood = 4709.97, aic = -9315.95
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0004849168 0.1380856 0.07509408 -0.01418438 0.5702983 0.2919624
## ACF1
## Training set 0.006318137
# Forecast horizon = number of held-out (2024) days.
steps.ahead = length(test_set$bus)
# Index of the COVID shock within the training series.
# NOTE(review): hard-coded; confirm it equals
# which(train_set$service_date == intervention_date).
intervention_index <- 6708
# Per-day exponential decay of the intervention effect
# (half-life of log(2)/0.0005, roughly 1386 days).
decay_rate <- 0.0005
# Create a slowly decaying ramp: 0 before start_idx, then exp(-decay_rate * t)
# where t is the number of steps since the intervention index.
#
# length     - total length of the returned vector
# start_idx  - 1-based index where the ramp starts (value 1 there)
# decay_rate - per-step exponential decay rate
# Returns a numeric vector of the requested length.
create_ramp <- function(length, start_idx, decay_rate) {
  ramp <- numeric(length)
  # Vectorized (replaces the element-wise loop) and guarded: if
  # start_idx > length the original `start_idx:length` counted DOWN and
  # silently grew the vector; now we return all zeros instead.
  if (start_idx <= length) {
    idx <- start_idx:length
    ramp[idx] <- exp(-decay_rate * (idx - start_idx))
  }
  ramp
}
# Create the ramp function for the entire period (train + test)
total_length <- length(train_set$bus) + steps.ahead
ramp_function <- create_ramp(total_length, intervention_index, decay_rate)
# Extract the ramp function for the training period
# Negated: the intervention suppresses (log) ridership, so the regressor
# starts at -1 at the shock and decays toward 0.
tf <- -ramp_function
# Append the intervention regressor as an extra (unnamed) xreg column.
xreg_train <- cbind(fourier_terms_bus_train, tf[1:(length(tf) - steps.ahead)])
# Final bus model: regression with ARIMA errors on log boardings.
bus.forecast.arima <- Arima(log(train_set$bus), order=arima_orders_bus, xreg = xreg_train)
The below plot shows how the intervention strength begins at -1 and slowly decays as time moves on. The strength of the COVID intervention is still present even four years after the initial intervention.
# Visualize the intervention regressor over the full date range: -1 at the
# COVID shock, slowly decaying toward 0.
plot(df$service_date, tf, main='Display of Intervention Strength Regressor')
### Forecasts
The ARIMAX forecasts are slightly narrower than the actual boardings, but the seasonality is captured well by the Fourier series.
# Slice of the intervention regressor covering the forecast period.
start_idx <- length(tf) - steps.ahead + 1
# NOTE(review): the "+1" shifts the intervention regressor up by 1 for the
# forecast period only; the rail and total sections pass tf unshifted.
# Confirm this offset is intentional rather than a leftover experiment.
xreg_forecast <- cbind(fourier_terms_bus_test, tf[start_idx:length(tf)]+1)
pred <- predict(bus.forecast.arima, n.ahead = steps.ahead, newxreg = xreg_forecast)
# Back-transform the log-scale forecasts to boardings.
preds <- exp(pred$pred)
# Assemble 2024 actuals and forecasts, then overlay them on one chart.
bus_plot_data <- data.frame(Date = test_set$service_date,
                            Actual = test_set$bus,
                            Forecasted = preds)
ggplot(bus_plot_data, aes(x = Date)) +
  geom_line(aes(y = Actual, color = "Actual")) +
  geom_line(aes(y = Forecasted, color = "Forecasted")) +
  scale_color_manual(name = "Legend",
                     values = c("Actual" = "blue", "Forecasted" = "red")) +
  labs(x = "Date (2024)",
       y = "Boardings",
       title = "Bus Boardings: Actual vs Forecasted") +
  theme_minimal()
# Accuracy metrics on the original boardings scale.
rmse_value <- rmse(bus_plot_data$Actual, bus_plot_data$Forecasted)
mae_value <- mae(bus_plot_data$Actual, bus_plot_data$Forecasted)
# Metrics::mape returns a fraction (0.138 = 13.8%), not a percentage.
mape_value <- mape(bus_plot_data$Actual, bus_plot_data$Forecasted)
tolerance <- 0.3
# "Coverage" = share of test days whose forecast is within +/-30% of actual.
coverage_value <- mean(abs(bus_plot_data$Actual - bus_plot_data$Forecasted) / bus_plot_data$Actual <= tolerance) * 100
# Print the results
cat("2024 Forecasts for Bus Ridership Metrics \n")
## 2024 Forecasts for Bus Ridership Metrics
cat("RMSE:", rmse_value, "\n")
## RMSE: 73877.61
cat("MAE:", mae_value, "\n")
## MAE: 45003.07
cat("MAPE:", mape_value, "\n")
## MAPE: 0.1383404
cat("Coverage:", coverage_value, "%\n")
## Coverage: 90 %
# Define the intervention date
intervention_date <- as.Date("2020-03-13")
# Same 2024 holdout split as the bus section.
train_set <- df[df$service_date < as.Date("2024-01-01"), ]
test_set <- df[df$service_date >= as.Date("2024-01-01"), ]
# Fourier harmonics per seasonal period (7 / 30 / 365.25 days).
K_weekly <- 3
K_monthly <- 6
K_yearly <- 12
# Multi-seasonal series over the FULL history so Fourier rows align with df.
ts_rail <- msts(df$rail_boardings, seasonal.periods = c(7, 30, 365.25))
fourier_terms_rail <- fourier(ts_rail, K = c(K_weekly, K_monthly, K_yearly))
fourier_terms_rail_train <- fourier_terms_rail[1:nrow(train_set), ]
fourier_terms_rail_test <- fourier_terms_rail[(nrow(train_set) + 1):nrow(df), ]
fourier_terms_rail_precovid <- fourier_terms_rail[1:nrow(pre.covid), ]
# Baseline pre-COVID rail model on the log scale; Fourier terms carry the
# seasonality, so seasonal ARIMA terms are disabled.
baseline_arima_rail <- auto.arima(log(pre.covid$rail_boardings), xreg = fourier_terms_rail_precovid, seasonal=FALSE)
summary(baseline_arima_rail)
## Series: log(pre.covid$rail_boardings)
## Regression with ARIMA(0,1,1) errors
##
## Coefficients:
## ma1 S1-7 C1-7 S2-7 C2-7 S3-7 C3-7 S1-30
## -0.9900 0.1064 -0.3560 0.1247 -0.2077 0.0600 -0.0853 -0.0024
## s.e. 0.0013 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027
## C1-30 S2-30 C2-30 S3-30 C3-30 S4-30 C4-30 S5-30 C5-30
## 0.0001 0.0015 0.0017 0.0016 0.0039 0.0026 0.0000 0.0012 -0.0001
## s.e. 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027
## S6-30 C6-30 S1-365 C1-365 S2-365 C2-365 S3-365 C3-365
## -0.0004 0.0003 -0.0381 -0.0796 -0.0018 -0.0602 0.0161 -0.0482
## s.e. 0.0027 0.0027 0.0031 0.0031 0.0028 0.0028 0.0027 0.0027
## S4-365 C4-365 S5-365 C5-365 S6-365 C6-365 S7-365 C7-365
## 0.0171 -0.0133 -0.0007 -0.0209 -0.0003 -0.0202 0.0148 -0.0191
## s.e. 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027
## S8-365 C8-365 S9-365 C9-365 S10-365 C10-365 S11-365 C11-365
## -0.0100 -0.0322 0.0115 -0.0260 0.0126 -0.0296 0.0212 -0.0187
## s.e. 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027 0.0027
## S12-365 C12-365
## 0.0336 -0.0262
## s.e. 0.0027 0.0027
##
## sigma^2 = 0.02504: log likelihood = 2997.28
## AIC=-5906.56 AICc=-5905.99 BIC=-5604.94
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002849893 0.1577315 0.08931229 0.006527449 0.6909916 0.322982
## ACF1
## Training set 0.2481906
# $arma is c(p, q, P, Q, period, d, D), so this extracts (p, d, q).
arima_orders_rail <- c(baseline_arima_rail$arma[1], baseline_arima_rail$arma[6], baseline_arima_rail$arma[2])
# Pulse indicator at observation 6708.
# NOTE(review): hard-coded intervention index — confirm it equals
# which(train_set$service_date == intervention_date).
pulse_train <- 1*(seq(train_set$rail_boardings) == 6708)
# ARIMAX with an AR(1) transfer function on the pulse (decaying intervention).
first.model.x <- arimax(log(train_set$rail_boardings), order=arima_orders_rail, xtransf=data.frame(pulse_train), xreg=fourier_terms_rail_train, transfer=list(c(1,0)))
summary(first.model.x)
##
## Call:
## arimax(x = log(train_set$rail_boardings), order = arima_orders_rail, xreg = fourier_terms_rail_train,
## xtransf = data.frame(pulse_train), transfer = list(c(1, 0)))
##
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): NaNs produced
## ma1 S1.7 C1.7 S2.7 C2.7 S3.7 C3.7 S1.30
## -0.8674 0.0958 -0.3370 0.1115 -0.1943 0.0531 -0.0812 -0.0015
## s.e. 0.0063 0.0025 0.0025 0.0025 0.0025 0.0025 0.0025 0.0030
## C1.30 S2.30 C2.30 S3.30 C3.30 S4.30 C4.30 S5.30 C5.30
## -0.0012 0.0034 0.0010 0.0002 0.0035 0.0014 0.0004 0.0006 -0.0005
## s.e. 0.0030 0.0026 0.0026 0.0025 0.0025 0.0025 0.0025 0.0025 0.0025
## S6.30 C6.30 S1.365 C1.365 S2.365 C2.365 S3.365 C3.365
## 0.0001 0.0001 -0.0581 -0.0632 0.0137 -0.0563 0.0200 -0.0564
## s.e. 0.0025 0.0025 0.0204 0.0204 0.0104 0.0104 0.0072 0.0072
## S4.365 C4.365 S5.365 C5.365 S6.365 C6.365 S7.365 C7.365
## 0.0123 -0.0165 -0.0063 -0.0189 0.0005 -0.0164 0.0170 -0.0182
## s.e. 0.0056 0.0056 0.0047 0.0047 0.0042 0.0042 0.0038 0.0038
## S8.365 C8.365 S9.365 C9.365 S10.365 C10.365 S11.365 C11.365
## -0.0086 -0.0345 0.0104 -0.0273 0.0088 -0.0294 0.0225 -0.0169
## s.e. 0.0035 0.0035 0.0033 0.0033 0.0032 0.0032 0.0031 0.0031
## S12.365 C12.365 pulse_train-AR1 pulse_train-MA0
## 0.0341 -0.0255 0.0265 -0.0169
## s.e. 0.0030 0.0030 NaN 0.1068
##
## sigma^2 estimated as 0.02903: log likelihood = 2945.25, aic = -5800.5
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -4.324726e-05 0.1703759 0.1016747 -0.0170441 0.7986523 0.3862901
## ACF1
## Training set 0.2475161
# Forecast horizon = number of held-out (2024) days.
steps.ahead = length(test_set$rail_boardings)
# NOTE(review): hard-coded COVID shock index; confirm it equals
# which(train_set$service_date == intervention_date).
intervention_index <- 6708
# Per-day exponential decay of the intervention effect.
decay_rate <- 0.0005
# Create a slowly decaying ramp: 0 before start_idx, then exp(-decay_rate * t)
# where t is the number of steps since the intervention index.
# (Same helper is redefined identically in each section of this script.)
#
# length     - total length of the returned vector
# start_idx  - 1-based index where the ramp starts (value 1 there)
# decay_rate - per-step exponential decay rate
# Returns a numeric vector of the requested length.
create_ramp <- function(length, start_idx, decay_rate) {
  ramp <- numeric(length)
  # Vectorized and guarded: if start_idx > length the original loop over
  # `start_idx:length` counted DOWN and silently grew the vector; now we
  # return all zeros instead.
  if (start_idx <= length) {
    idx <- start_idx:length
    ramp[idx] <- exp(-decay_rate * (idx - start_idx))
  }
  ramp
}
# Create the ramp function for the entire period (train + test).
# Consistency fix: measure the rail series length here (identical value to
# train_set$bus — both are columns of the same data frame — but this section
# models rail).
total_length <- length(train_set$rail_boardings) + steps.ahead
ramp_function <- create_ramp(total_length, intervention_index, decay_rate)
# Negated: the intervention suppresses (log) ridership, so the regressor
# starts at -1 at the shock and decays toward 0.
tf <- -ramp_function
# Training regressors: Fourier terms plus the intervention ramp column.
xreg_train <- cbind(fourier_terms_rail_train, tf[1:(length(tf) - steps.ahead)])
# Final rail model: regression with ARIMA errors on log boardings.
rail.forecast.arima <- Arima(log(train_set$rail_boardings), order=arima_orders_rail, xreg = xreg_train)
rail.forecast.arima
## Series: log(train_set$rail_boardings)
## Regression with ARIMA(0,1,1) errors
##
## Coefficients:
## ma1 S1-7 C1-7 S2-7 C2-7 S3-7 C3-7 S1-30
## -0.8674 0.0958 -0.3370 0.1115 -0.1943 0.0531 -0.0812 -0.0015
## s.e. 0.0063 0.0025 0.0025 0.0025 0.0025 0.0025 0.0025 0.0030
## C1-30 S2-30 C2-30 S3-30 C3-30 S4-30 C4-30 S5-30 C5-30
## -0.0013 0.0034 0.0010 0.0001 0.0036 0.0014 0.0004 0.0006 -0.0006
## s.e. 0.0030 0.0026 0.0026 0.0025 0.0025 0.0025 0.0025 0.0025 0.0025
## S6-30 C6-30 S1-365 C1-365 S2-365 C2-365 S3-365 C3-365
## 0.0001 0.0001 -0.0601 -0.0632 0.0141 -0.0563 0.0201 -0.0564
## s.e. 0.0025 0.0025 0.0205 0.0204 0.0104 0.0104 0.0072 0.0072
## S4-365 C4-365 S5-365 C5-365 S6-365 C6-365 S7-365 C7-365
## 0.0121 -0.0165 -0.0063 -0.0189 0.0006 -0.0164 0.0170 -0.0182
## s.e. 0.0056 0.0056 0.0047 0.0047 0.0042 0.0042 0.0038 0.0038
## S8-365 C8-365 S9-365 C9-365 S10-365 C10-365 S11-365 C11-365
## -0.0086 -0.0345 0.0104 -0.0273 0.0088 -0.0293 0.0226 -0.0169
## s.e. 0.0035 0.0035 0.0033 0.0033 0.0032 0.0032 0.0031 0.0031
## S12-365 C12-365
## 0.0341 -0.0255 0.0166
## s.e. 0.0030 0.0030 0.0861
##
## sigma^2 = 0.02918: log likelihood = 2945.27
## AIC=-5800.54 AICc=-5800.05 BIC=-5483.93
# Forecast-period slice of the intervention regressor (passed unshifted here,
# unlike the bus section's "+1" offset).
start_idx <- length(tf) - steps.ahead + 1
xreg_forecast <- cbind(fourier_terms_rail_test, tf[start_idx:length(tf)])
pred <- predict(rail.forecast.arima, n.ahead = steps.ahead, newxreg = xreg_forecast)
# Back-transform the log-scale forecasts to boardings.
preds <- exp(pred$pred)
# Assemble 2024 actuals and forecasts for rail, then overlay them.
rail_plot_data <- data.frame(Date = test_set$service_date,
                             Actual = test_set$rail_boardings,
                             Forecasted = preds)
ggplot(rail_plot_data, aes(x = Date)) +
  geom_line(aes(y = Actual, color = "Actual")) +
  geom_line(aes(y = Forecasted, color = "Forecasted")) +
  scale_color_manual(name = "Legend",
                     values = c("Actual" = "blue", "Forecasted" = "red")) +
  labs(x = "Date (2024)",
       y = "Boardings",
       title = "Rail Boardings: Actual vs Forecasted") +
  theme_minimal()
# Accuracy metrics on the original boardings scale.
rmse_value <- rmse(rail_plot_data$Actual, rail_plot_data$Forecasted)
mae_value <- mae(rail_plot_data$Actual, rail_plot_data$Forecasted)
# Metrics::mape returns a fraction (0.154 = 15.4%), not a percentage.
mape_value <- mape(rail_plot_data$Actual, rail_plot_data$Forecasted)
# Calculate Coverage
tolerance <- 0.3
# "Coverage" = share of test days whose forecast is within +/-30% of actual.
coverage_value <- mean(abs(rail_plot_data$Actual - rail_plot_data$Forecasted) / rail_plot_data$Actual <= tolerance) * 100
# Print the results
cat("2024 Forecasts for Rail Ridership Metrics \n")
## 2024 Forecasts for Rail Ridership Metrics
cat("RMSE:", rmse_value, "\n")
## RMSE: 56429.06
cat("MAE:", mae_value, "\n")
## MAE: 32889.26
cat("MAPE:", mape_value, "\n")
## MAPE: 0.1544448
cat("Coverage:", coverage_value, "%\n")
## Coverage: 88.33333 %
# Define the intervention date
intervention_date <- as.Date("2020-03-13")
# Same 2024 holdout split as the bus and rail sections.
train_set <- df[df$service_date < as.Date("2024-01-01"), ]
test_set <- df[df$service_date >= as.Date("2024-01-01"), ]
# Fourier harmonics per seasonal period (7 / 30 / 365.25 days).
K_weekly <- 3
K_monthly <- 6
K_yearly <- 12
# Multi-seasonal series over the FULL history so Fourier rows align with df.
ts_tot <- msts(df$total_rides, seasonal.periods = c(7, 30, 365.25))
fourier_terms_tot <- fourier(ts_tot, K = c(K_weekly, K_monthly, K_yearly))
fourier_terms_tot_train <- fourier_terms_tot[1:nrow(train_set), ]
fourier_terms_tot_test <- fourier_terms_tot[(nrow(train_set) + 1):nrow(df), ]
fourier_terms_tot_precovid <- fourier_terms_tot[1:nrow(pre.covid), ]
# Baseline pre-COVID total-rides model on the log scale; Fourier terms carry
# the seasonality, so seasonal ARIMA terms are disabled.
baseline_arima_tot <- auto.arima(log(pre.covid$total_rides), xreg = fourier_terms_tot_precovid, seasonal=FALSE)
summary(baseline_arima_tot)
## Series: log(pre.covid$total_rides)
## Regression with ARIMA(2,1,0) errors
##
## Coefficients:
## ar1 ar2 S1-7 C1-7 S2-7 C2-7 S3-7 C3-7
## -0.4271 -0.2562 0.0987 -0.3332 0.1102 -0.2024 0.0543 -0.0902
## s.e. 0.0116 0.0116 0.0023 0.0023 0.0023 0.0023 0.0018 0.0018
## S1-30 C1-30 S2-30 C2-30 S3-30 C3-30 S4-30 C4-30 S5-30
## -0.0016 -0.0006 0.0021 0.0013 0.0021 0.0037 0.0028 0.0002 0.0010
## s.e. 0.0078 0.0078 0.0041 0.0041 0.0029 0.0029 0.0024 0.0024 0.0022
## C5-30 S6-30 C6-30 S1-365 C1-365 S2-365 C2-365 S3-365
## 0.0006 -0.0007 0.0005 -0.0196 -0.0533 -0.0024 -0.0634 0.0224
## s.e. 0.0022 0.0021 0.0021 0.0933 0.0935 0.0468 0.0467 0.0311
## C3-365 S4-365 C4-365 S5-365 C5-365 S6-365 C6-365 S7-365
## -0.0388 0.0119 -0.0170 -0.0027 -0.0180 0.0031 -0.0195 0.0102
## s.e. 0.0312 0.0234 0.0234 0.0187 0.0187 0.0156 0.0156 0.0134
## C7-365 S8-365 C8-365 S9-365 C9-365 S10-365 C10-365 S11-365
## -0.0159 -0.0099 -0.0321 0.0107 -0.0233 0.0144 -0.0316 0.0175
## s.e. 0.0134 0.0117 0.0117 0.0105 0.0105 0.0094 0.0094 0.0086
## C11-365 S12-365 C12-365
## -0.0152 0.0325 -0.0212
## s.e. 0.0086 0.0079 0.0079
##
## sigma^2 = 0.02578: log likelihood = 2897.52
## AIC=-5705.04 AICc=-5704.45 BIC=-5396.56
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0001538115 0.160034 0.08452967 -0.008480523 0.6104601 0.3174897
## ACF1
## Training set -0.04823666
# $arma is c(p, q, P, Q, period, d, D), so this extracts (p, d, q).
arima_orders_tot <- c(baseline_arima_tot$arma[1], baseline_arima_tot$arma[6], baseline_arima_tot$arma[2])
# Pulse indicator at observation 6708.
# NOTE(review): hard-coded intervention index — confirm it equals
# which(train_set$service_date == intervention_date).
pulse_train <- 1*(seq(train_set$total_rides) == 6708)
# ARIMAX with an AR(1) transfer function on the pulse (decaying intervention).
first.model.x <- arimax(log(train_set$total_rides), order=arima_orders_tot, xtransf=data.frame(pulse_train), xreg=fourier_terms_tot_train, transfer=list(c(1,0)))
summary(first.model.x)
##
## Call:
## arimax(x = log(train_set$total_rides), order = arima_orders_tot, xreg = fourier_terms_tot_train,
## xtransf = data.frame(pulse_train), transfer = list(c(1, 0)))
##
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): NaNs produced
## ar1 ar2 S1.7 C1.7 S2.7 C2.7 S3.7 C3.7
## -0.4156 -0.2637 0.0934 -0.3216 0.1024 -0.1930 0.0499 -0.0872
## s.e. 0.0106 0.0106 0.0021 0.0021 0.0022 0.0022 0.0016 0.0016
## S1.30 C1.30 S2.30 C2.30 S3.30 C3.30 S4.30 C4.30 S5.30
## -0.0007 -0.0018 0.0035 0.0006 0.0011 0.0030 0.0015 0.0000 5e-04
## s.e. 0.0071 0.0071 0.0037 0.0037 0.0027 0.0027 0.0022 0.0022 2e-03
## C5.30 S6.30 C6.30 S1.365 C1.365 S2.365 C2.365 S3.365 C3.365
## 1e-04 -3e-04 -2e-04 -0.0325 -0.0453 0.0079 -0.0610 0.0247 -0.0440
## s.e. 2e-03 2e-03 2e-03 0.0855 0.0855 0.0428 0.0428 0.0285 0.0285
## S4.365 C4.365 S5.365 C5.365 S6.365 C6.365 S7.365 C7.365
## 0.0080 -0.0199 -0.0070 -0.0159 0.0035 -0.0158 0.0124 -0.0159
## s.e. 0.0214 0.0214 0.0171 0.0171 0.0143 0.0143 0.0123 0.0123
## S8.365 C8.365 S9.365 C9.365 S10.365 C10.365 S11.365 C11.365
## -0.0084 -0.0338 0.0095 -0.0244 0.0120 -0.0316 0.0194 -0.0140
## s.e. 0.0108 0.0108 0.0096 0.0096 0.0086 0.0086 0.0079 0.0079
## S12.365 C12.365 pulse_train-AR1 pulse_train-MA0
## 0.0326 -0.0205 0.0033 0.0017
## s.e. 0.0072 0.0072 NaN NaN
##
## sigma^2 estimated as 0.02563: log likelihood = 3469.78, aic = -6847.56
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0001207814 0.160072 0.08922034 -0.008687299 0.6516897 0.3467526
## ACF1
## Training set -0.04842449
# Forecast horizon = number of held-out (2024) days.
steps.ahead = length(test_set$total_rides)
# NOTE(review): hard-coded COVID shock index; confirm it equals
# which(train_set$service_date == intervention_date).
intervention_index <- 6708
decay_rate <- 0.0005 # Adjust decay rate as needed
# Create a slowly decaying ramp: 0 before start_idx, then exp(-decay_rate * t)
# where t is the number of steps since the intervention index.
# (Same helper is redefined identically in each section of this script.)
#
# length     - total length of the returned vector
# start_idx  - 1-based index where the ramp starts (value 1 there)
# decay_rate - per-step exponential decay rate
# Returns a numeric vector of the requested length.
create_ramp <- function(length, start_idx, decay_rate) {
  ramp <- numeric(length)
  # Vectorized and guarded: if start_idx > length the original loop over
  # `start_idx:length` counted DOWN and silently grew the vector; now we
  # return all zeros instead.
  if (start_idx <= length) {
    idx <- start_idx:length
    ramp[idx] <- exp(-decay_rate * (idx - start_idx))
  }
  ramp
}
# Create the ramp function for the entire period (train + test).
# Consistency fix: measure the total_rides series length here (identical
# value to train_set$bus — both are columns of the same data frame — but
# this section models total rides).
total_length <- length(train_set$total_rides) + steps.ahead
ramp_function <- create_ramp(total_length, intervention_index, decay_rate)
# Negated: the intervention suppresses (log) ridership, so the regressor
# starts at -1 at the shock and decays toward 0.
tf <- -ramp_function
# Training regressors: Fourier terms plus the intervention ramp column.
xreg_train <- cbind(fourier_terms_tot_train, tf[1:(length(tf) - steps.ahead)])
# Final total-rides model: regression with ARIMA errors on log boardings.
tot.forecast.arima <- Arima(log(train_set$total_rides), order=arima_orders_tot, xreg = xreg_train)
tot.forecast.arima
## Series: log(train_set$total_rides)
## Regression with ARIMA(2,1,0) errors
##
## Coefficients:
## ar1 ar2 S1-7 C1-7 S2-7 C2-7 S3-7 C3-7
## -0.4156 -0.2636 0.0934 -0.3216 0.1024 -0.1930 0.0499 -0.0872
## s.e. 0.0106 0.0106 0.0021 0.0021 0.0022 0.0022 0.0016 0.0016
## S1-30 C1-30 S2-30 C2-30 S3-30 C3-30 S4-30 C4-30 S5-30
## -0.0008 -0.0018 0.0034 0.0007 0.0011 0.0030 0.0015 0.0000 5e-04
## s.e. 0.0071 0.0071 0.0037 0.0037 0.0027 0.0027 0.0022 0.0022 2e-03
## C5-30 S6-30 C6-30 S1-365 C1-365 S2-365 C2-365 S3-365 C3-365
## 1e-04 -3e-04 -2e-04 -0.0289 -0.0449 0.0094 -0.0617 0.0260 -0.0440
## s.e. 2e-03 2e-03 2e-03 0.0856 0.0855 0.0428 0.0428 0.0285 0.0285
## S4-365 C4-365 S5-365 C5-365 S6-365 C6-365 S7-365 C7-365
## 0.0088 -0.0199 -0.0065 -0.0159 0.0039 -0.0158 0.0127 -0.0160
## s.e. 0.0214 0.0214 0.0171 0.0171 0.0143 0.0143 0.0123 0.0123
## S8-365 C8-365 S9-365 C9-365 S10-365 C10-365 S11-365 C11-365
## -0.0082 -0.0338 0.0096 -0.0244 0.0121 -0.0316 0.0194 -0.0140
## s.e. 0.0108 0.0108 0.0096 0.0096 0.0086 0.0086 0.0079 0.0079
## S12-365 C12-365
## 0.0326 -0.0204 -0.0100
## s.e. 0.0072 0.0072 0.1443
##
## sigma^2 = 0.02576: log likelihood = 3469.79
## AIC=-6847.58 AICc=-6847.06 BIC=-6523.93
# Forecast-period slice of the intervention regressor (passed unshifted here,
# unlike the bus section's "+1" offset).
start_idx <- length(tf) - steps.ahead + 1
xreg_forecast <- cbind(fourier_terms_tot_test, tf[start_idx:length(tf)])
pred <- predict(tot.forecast.arima, n.ahead = steps.ahead, newxreg = xreg_forecast)
# Back-transform the log-scale forecasts to boardings.
preds <- exp(pred$pred)
# Assemble 2024 actuals and forecasts for total rides, then overlay them.
tot_plot_data <- data.frame(Date = test_set$service_date,
                            Actual = test_set$total_rides,
                            Forecasted = preds)
ggplot(tot_plot_data, aes(x = Date)) +
  geom_line(aes(y = Actual, color = "Actual")) +
  geom_line(aes(y = Forecasted, color = "Forecasted")) +
  scale_color_manual(name = "Legend",
                     values = c("Actual" = "blue", "Forecasted" = "red")) +
  labs(x = "Date (2024)",
       y = "Boardings",
       title = "Total Boardings: Actual vs Forecasted") +
  theme_minimal()
# Accuracy metrics for the total-rides forecast, on the boardings scale.
rmse_value <- rmse(tot_plot_data$Actual, tot_plot_data$Forecasted)
mae_value <- mae(tot_plot_data$Actual, tot_plot_data$Forecasted)
# Metrics::mape returns a fraction (0.469 = 46.9%), not a percentage.
mape_value <- mape(tot_plot_data$Actual, tot_plot_data$Forecasted)
# Calculate Coverage: share of days within +/-30% of the actual value.
tolerance <- 0.3
coverage_value <- mean(abs(tot_plot_data$Actual - tot_plot_data$Forecasted) / tot_plot_data$Actual <= tolerance) * 100
# Print the results
# Fix: this section reports TOTAL ridership, not rail (copy-paste label).
cat("2024 Forecasts for Total Ridership Metrics \n")
## 2024 Forecasts for Total Ridership Metrics
cat("RMSE:", rmse_value, "\n")
## RMSE: 342775.1
cat("MAE:", mae_value, "\n")
## MAE: 310560.4
cat("MAPE:", mape_value, "\n")
## MAPE: 0.4693072
cat("Coverage:", coverage_value, "%\n")
## Coverage: 33.33333 %